library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
DARWIN <- read.csv("~/GitHub/FCA/Data/DARWIN/DARWIN.csv")
rownames(DARWIN) <- DARWIN$ID
DARWIN$ID <- NULL
DARWIN$class <- 1*(DARWIN$class=="P")
print(table(DARWIN$class))
#>
#> 0 1
#> 85 89
DARWIN[,1:ncol(DARWIN)] <- sapply(DARWIN,as.numeric)
signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(DARWIN) %in% c("class"));
DARWIN[,whof] <- signedlog(DARWIN[,whof])
studyName <- "DARWIN"
dataframe <- DARWIN
outcome <- "class"
TopVariables <- 10
thro <- 0.80
cexheat = 0.15
Some libraries
library(psych)
library(whitening)
library("vioplot")
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
| rows | col |
|---|---|
| 174 | 450 |
pander::pander(table(dataframe[,outcome]))
| 0 | 1 |
|---|---|
| 85 | 89 |
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
largeSet <- length(varlist) > 1000
Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns
### Some global cleaning
sdiszero <- apply(dataframe,2,sd) > 1.0e-16
dataframe <- dataframe[,sdiszero]
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
dataframe <- dataframe[,tokeep]
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]
dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData
if (!largeSet)
{
hm <- heatMaps(data=dataframe,
Outcome=outcome,
Scale=TRUE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
srtCol=45,
srtRow=45,
cexCol=cexheat,
cexRow=cexheat
)
par(op)
}
The heat map of the data
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
#cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
cormat <- cor(dataframe[,varlist],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
# scale = "row",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Original Correlation",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.9992136
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#>
#> Included: 450 , Uni p: 0.006350853 , Uncorrelated Base: 241 , Outcome-Driven Size: 0 , Base Size: 241
#>
#>
1 <R=0.999,w= 1,N= 82>, Top: 41( 1 )[ 1 : 41 Fa= 41 : 0.975 ]( 41 , 41 , 0 ),<|>Tot Used: 82 , Added: 41 , Zero Std: 0 , Max Cor: 0.974
#>
2 <R=0.974,w= 1,N= 82>, Top: 18( 1 )[ 1 : 18 Fa= 58 : 0.962 ]( 18 , 19 , 41 ),<|>Tot Used: 118 , Added: 19 , Zero Std: 0 , Max Cor: 0.960
#>
3 <R=0.960,w= 1,N= 82>, Top: 8( 1 )[ 1 : 8 Fa= 66 : 0.955 ]( 8 , 8 , 58 ),<|>Tot Used: 134 , Added: 8 , Zero Std: 0 , Max Cor: 0.955
#>
4 <R=0.955,w= 2,N= 43>, Top: 21( 1 )[ 1 : 21 Fa= 84 : 0.927 ]( 21 , 21 , 66 ),<|>Tot Used: 173 , Added: 21 , Zero Std: 0 , Max Cor: 0.927
#>
5 <R=0.927,w= 2,N= 43>, Top: 8( 1 )[ 1 : 8 Fa= 89 : 0.914 ]( 8 , 9 , 84 ),<|>Tot Used: 187 , Added: 9 , Zero Std: 0 , Max Cor: 0.912
#>
6 <R=0.912,w= 2,N= 43>, Top: 3( 1 )[ 1 : 3 Fa= 90 : 0.906 ]( 3 , 3 , 89 ),<|>Tot Used: 191 , Added: 3 , Zero Std: 0 , Max Cor: 0.906
#>
7 <R=0.906,w= 3,N= 110>, Top: 50( 2 )[ 1 : 50 Fa= 129 : 0.853 ]( 49 , 54 , 90 ),<|>Tot Used: 278 , Added: 54 , Zero Std: 0 , Max Cor: 0.883
#>
8 <R=0.883,w= 3,N= 110>, Top: 11( 1 )[ 1 : 11 Fa= 135 : 0.841 ]( 11 , 11 , 129 ),<|>Tot Used: 290 , Added: 11 , Zero Std: 0 , Max Cor: 0.841
#>
9 <R=0.841,w= 4,N= 52>, Top: 25( 1 )[ 1 : 25 Fa= 142 : 0.800 ]( 24 , 26 , 135 ),<|>Tot Used: 306 , Added: 26 , Zero Std: 0 , Max Cor: 0.929
#>
10 <R=0.929,w= 4,N= 52>, Top: 3( 1 )[ 1 : 3 Fa= 144 : 0.814 ]( 3 , 3 , 142 ),<|>Tot Used: 306 , Added: 3 , Zero Std: 0 , Max Cor: 0.810
#>
11 <R=0.810,w= 5,N= 4>, Top: 2( 1 )[ 1 : 2 Fa= 144 : 0.800 ]( 2 , 2 , 144 ),<|>Tot Used: 306 , Added: 2 , Zero Std: 0 , Max Cor: 0.798
#>
12 <R=0.798,w= 6,N= 0>
#>
[ 12 ], 0.7945852 Decor Dimension: 306 . Cor to Base: 161 , ABase: 118 , Outcome Base: 0
#>
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]
pander::pander(sum(apply(dataframe[,varlist],2,var)))
489
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
330
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
4.9
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
4.61
if (!largeSet)
{
par(cex=0.6,cex.main=0.85,cex.axis=0.7)
UPSTM <- attr(DEdataframe,"UPSTM")
gplots::heatmap.2(1.0*(abs(UPSTM)>0),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Decorrelation matrix",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="|Beta|>0",
xlab="Output Feature", ylab="Input Feature")
par(op)
}
if (!largeSet)
{
cormat <- cor(DEdataframe[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0
gplots::heatmap.2(abs(cormat),
trace = "none",
mar = c(5,5),
col=rev(heat.colors(5)),
main = "Correlation after IDeA",
cexRow = cexheat,
cexCol = cexheat,
srtCol=45,
srtRow=45,
key.title=NA,
key.xlab="Pearson Correlation",
xlab="Feature", ylab="Feature")
par(op)
diag(cormat) <- 0
print(max(abs(cormat)))
}
[1]
0.798365
classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])
datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])
univarRAW <- uniRankVar(varlist,
paste(outcome,"~1"),
outcome,
dataframe,
rankingTest="AUC")
100 : mean_jerk_in_air6 200 : disp_index12 300 : mean_speed_in_air17 400 : gmrt_on_paper23
univarDe <- uniRankVar(varlistc,
paste(outcome,"~1"),
outcome,
DEdataframe,
rankingTest="AUC",
)
100 : mean_jerk_in_air6 200 : disp_index12 300 : La_mean_speed_in_air17 400 : La_gmrt_on_paper23
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")
##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| total_time23 | 0.767 | 0.909 | -0.366 | 0.736 | 6.93e-05 | 0.863 |
| total_time15 | 0.775 | 1.062 | -0.442 | 0.572 | 4.78e-01 | 0.844 |
| air_time23 | 0.599 | 0.766 | -0.374 | 0.715 | 2.31e-02 | 0.844 |
| air_time15 | 0.684 | 1.112 | -0.506 | 0.669 | 7.09e-01 | 0.829 |
| total_time17 | 0.806 | 1.082 | -0.400 | 0.966 | 3.10e-02 | 0.824 |
| paper_time23 | 0.690 | 1.106 | -0.435 | 0.703 | 6.55e-01 | 0.814 |
| air_time17 | 0.674 | 0.980 | -0.378 | 0.863 | 8.86e-02 | 0.806 |
| paper_time17 | 0.664 | 1.045 | -0.413 | 0.929 | 1.79e-01 | 0.796 |
| total_time6 | 0.680 | 1.069 | -0.364 | 0.665 | 7.13e-01 | 0.790 |
| air_time16 | 0.426 | 0.841 | -0.414 | 0.650 | 8.51e-01 | 0.787 |
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]
theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]
pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| total_time23 | 0.7669 | 0.9089 | -0.3659 | 0.7356 | 6.93e-05 | 0.863 |
| total_time15 | 0.7754 | 1.0623 | -0.4417 | 0.5721 | 4.78e-01 | 0.844 |
| total_time17 | 0.8059 | 1.0820 | -0.4001 | 0.9658 | 3.10e-02 | 0.824 |
| paper_time17 | 0.6641 | 1.0452 | -0.4133 | 0.9288 | 1.79e-01 | 0.796 |
| total_time6 | 0.6804 | 1.0693 | -0.3643 | 0.6653 | 7.13e-01 | 0.790 |
| disp_index23 | 0.5808 | 0.9243 | -0.3531 | 0.8162 | 3.70e-01 | 0.787 |
| total_time7 | 0.5542 | 0.8587 | -0.2404 | 0.8645 | 7.47e-03 | 0.785 |
| total_time16 | 0.5332 | 0.8335 | -0.3041 | 0.6411 | 1.78e-01 | 0.784 |
| total_time22 | 0.6670 | 1.0498 | -0.3412 | 0.6015 | 4.94e-01 | 0.780 |
| gmrt_in_air7 | -0.4478 | 0.8115 | 0.4227 | 0.7944 | 9.97e-01 | 0.775 |
| La_total_time2 | 0.3630 | 0.6054 | -0.0507 | 0.4814 | 2.31e-01 | 0.707 |
| La_disp_index21 | -0.4089 | 0.6245 | -0.0181 | 0.3042 | 8.89e-01 | 0.706 |
| La_air_time21 | 0.0669 | 0.3949 | -0.1956 | 0.3518 | 2.26e-01 | 0.706 |
| La_paper_time6 | 0.2865 | 0.6900 | -0.1515 | 0.5442 | 4.14e-01 | 0.698 |
| La_mean_jerk_in_air7 | -0.0162 | 0.0512 | -0.0572 | 0.0722 | 2.62e-01 | 0.690 |
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")
theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))
theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
| mean | total | fraction |
|---|---|---|
| 2.07 | 182 | 0.404 |
allSigvars <- names(dc)
dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
coef <- theFormulas[[dx]]
cname <- names(theFormulas[[dx]])
names(cname) <- cname
for (cf in names(coef))
{
if (cf != dx)
{
if (coef[cf]>0)
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
}
else
{
deFromula[dx] <- paste(deFromula[dx],
sprintf("%5.3f*%s",coef[cf],cname[cf]))
}
}
}
}
finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
| DecorFormula | caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | RAWAUC | fscores | |
|---|---|---|---|---|---|---|---|---|---|
| total_time23 | 0.7669 | 0.9089 | -0.3659 | 0.7356 | 6.93e-05 | 0.863 | 0.863 | 2 | |
| total_time15 | 0.7754 | 1.0623 | -0.4417 | 0.5721 | 4.78e-01 | 0.844 | 0.844 | 1 | |
| total_time17 | 0.8059 | 1.0820 | -0.4001 | 0.9658 | 3.10e-02 | 0.824 | 0.824 | 1 | |
| paper_time17 | 0.6641 | 1.0452 | -0.4133 | 0.9288 | 1.79e-01 | 0.796 | 0.796 | 1 | |
| total_time6 | 0.6804 | 1.0693 | -0.3643 | 0.6653 | 7.13e-01 | 0.790 | 0.790 | 1 | |
| total_time2 | NA | 0.5383 | 0.9470 | -0.4676 | 0.7818 | 7.26e-01 | 0.787 | 0.787 | NA |
| disp_index23 | 0.5808 | 0.9243 | -0.3531 | 0.8162 | 3.70e-01 | 0.787 | 0.787 | NA | |
| total_time7 | 0.5542 | 0.8587 | -0.2404 | 0.8645 | 7.47e-03 | 0.785 | 0.785 | 1 | |
| total_time16 | 0.5332 | 0.8335 | -0.3041 | 0.6411 | 1.78e-01 | 0.784 | 0.784 | 2 | |
| total_time22 | 0.6670 | 1.0498 | -0.3412 | 0.6015 | 4.94e-01 | 0.780 | 0.780 | 2 | |
| gmrt_in_air7 | -0.4478 | 0.8115 | 0.4227 | 0.7944 | 9.97e-01 | 0.775 | 0.775 | 1 | |
| paper_time6 | NA | 0.6807 | 1.2538 | -0.2913 | 0.7108 | 8.96e-01 | 0.729 | 0.729 | NA |
| disp_index2 | NA | 0.2160 | 0.8693 | -0.5135 | 1.0058 | 3.28e-01 | 0.719 | 0.719 | 2 |
| La_total_time2 | -0.812disp_index2 + 1.000total_time2 | 0.3630 | 0.6054 | -0.0507 | 0.4814 | 2.31e-01 | 0.707 | 0.787 | 0 |
| La_disp_index21 | + 1.000disp_index21 -0.906paper_time21 | -0.4089 | 0.6245 | -0.0181 | 0.3042 | 8.89e-01 | 0.706 | 0.538 | -1 |
| La_air_time21 | + 1.000air_time21 -0.847num_of_pendown21 | 0.0669 | 0.3949 | -0.1956 | 0.3518 | 2.26e-01 | 0.706 | 0.688 | -1 |
| La_paper_time6 | + 0.840mean_speed_on_paper6 + 1.000paper_time6 | 0.2865 | 0.6900 | -0.1515 | 0.5442 | 4.14e-01 | 0.698 | 0.729 | 0 |
| mean_jerk_in_air7 | NA | -0.3593 | 0.8283 | 0.1846 | 1.0872 | 7.51e-02 | 0.695 | 0.695 | NA |
| mean_acc_in_air7 | NA | -0.3322 | 0.7984 | 0.2342 | 1.0407 | 1.95e-01 | 0.695 | 0.695 | 1 |
| La_mean_jerk_in_air7 | -1.033mean_acc_in_air7 + 1.000mean_jerk_in_air7 | -0.0162 | 0.0512 | -0.0572 | 0.0722 | 2.62e-01 | 0.690 | 0.695 | -1 |
| air_time21 | NA | 0.2100 | 0.8146 | -0.3194 | 0.7050 | 7.65e-01 | 0.688 | 0.688 | NA |
| mean_speed_on_paper6 | NA | -0.4693 | 1.2166 | 0.1664 | 0.8458 | 8.52e-01 | 0.653 | 0.653 | 3 |
| num_of_pendown21 | NA | 0.1689 | 0.8947 | -0.1461 | 0.7239 | 3.54e-01 | 0.609 | 0.609 | 2 |
| paper_time21 | NA | 0.1398 | 1.1738 | -0.0900 | 1.0840 | 8.16e-01 | 0.542 | 0.542 | NA |
| disp_index21 | NA | -0.2822 | 1.3004 | -0.0996 | 0.9807 | 7.01e-02 | 0.538 | 0.538 | NA |